import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
!pip install ydata-profiling
Requirement already satisfied: ydata-profiling in c:\users\sylvia.pereira\anaconda3\lib\site-packages (4.1.0) Requirement already satisfied: matplotlib<3.7,>=3.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (3.5.2) Requirement already satisfied: imagehash==4.3.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (4.3.1) Requirement already satisfied: statsmodels<0.14,>=0.13.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.13.2) Requirement already satisfied: visions[type_image_path]==0.7.5 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.7.5) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (6.0) Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.12.3) Requirement already satisfied: scipy<1.10,>=1.4.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.9.1) Requirement already satisfied: pydantic<1.11,>=1.8.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.10.6) Requirement already satisfied: multimethod<1.10,>=1.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.9.1) Requirement already satisfied: tqdm<4.65,>=4.48.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (4.64.1) Requirement already satisfied: seaborn<0.13,>=0.10.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.11.2) Requirement already satisfied: numpy<1.24,>=1.16.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.21.5) Requirement already satisfied: htmlmin==0.1.12 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.1.12) Requirement already satisfied: jinja2<3.2,>=2.11.1 in 
c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.11.3) Requirement already satisfied: typeguard<2.14,>=2.13.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.13.3) Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.4.4) Requirement already satisfied: requests<2.29,>=2.24.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.28.1) Requirement already satisfied: pillow in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (9.2.0) Requirement already satisfied: PyWavelets in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (1.3.0) Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (0.2.0) Requirement already satisfied: attrs>=19.3.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (21.4.0) Requirement already satisfied: networkx>=2.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (2.8.4) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.0.1) Requirement already satisfied: python-dateutil>=2.7 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (1.4.2) Requirement already satisfied: cycler>=0.10 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (0.11.0) Requirement already satisfied: pyparsing>=2.2.1 in 
c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (3.0.9) Requirement already satisfied: fonttools>=4.22.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (4.25.0) Requirement already satisfied: packaging>=20.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (21.3) Requirement already satisfied: pytz>=2020.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata-profiling) (2022.1) Requirement already satisfied: joblib>=0.14.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.1.0) Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from pydantic<1.11,>=1.8.1->ydata-profiling) (4.3.0) Requirement already satisfied: idna<4,>=2.5 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (3.3) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (2.0.4) Requirement already satisfied: certifi>=2017.4.17 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (2022.9.14) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (1.26.11) Requirement already satisfied: patsy>=0.5.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from statsmodels<0.14,>=0.13.2->ydata-profiling) (0.5.2) Requirement already satisfied: colorama in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from tqdm<4.65,>=4.48.2->ydata-profiling) (0.4.5) Requirement already satisfied: six in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from patsy>=0.5.2->statsmodels<0.14,>=0.13.2->ydata-profiling) 
(1.16.0)
# Silence all library warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn.
import warnings
warnings.filterwarnings('ignore')

# Load the combined EDA dataset from a local, machine-specific CSV path.
csv_path = r'C:\Users\sylvia.pereira\OneDrive - alteryx.com\Desktop\Dataset_EDA_Combined_4.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()
| ID_Student | course_year_month | pass_course | dataplus | dualpane | externalquiz | folder | forumng | glossary | homepage | ... | ouelluminate | ouwiki | page | questionnaire | quiz | repeatactivity | resource | sharedsubpage | subpage | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 560374 | GGG 2013_October | True | 0 | 0 | 0 | 0 | 0 | 0 | 51 | ... | 0 | 0 | 0 | 0 | 65 | 0 | 5 | 0 | 8 | 0 |
| 1 | 519684 | EEE 2013_October | True | 0 | 2 | 0 | 0 | 525 | 0 | 706 | ... | 0 | 193 | 0 | 0 | 371 | 0 | 77 | 0 | 215 | 60 |
| 2 | 570529 | GGG 2013_October | True | 0 | 0 | 0 | 0 | 45 | 2 | 275 | ... | 0 | 0 | 0 | 0 | 158 | 0 | 77 | 0 | 15 | 0 |
| 3 | 643653 | FFF 2014_October | True | 0 | 0 | 0 | 0 | 411 | 0 | 470 | ... | 0 | 14 | 5 | 19 | 1001 | 0 | 20 | 0 | 254 | 37 |
| 4 | 678680 | BBB 2014_October | True | 0 | 0 | 0 | 0 | 109 | 4 | 235 | ... | 0 | 0 | 0 | 0 | 118 | 0 | 84 | 0 | 26 | 0 |
5 rows × 23 columns
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| ID_Student | dataplus | dualpane | externalquiz | folder | forumng | glossary | homepage | htmlactivity | oucollaborate | ... | ouelluminate | ouwiki | page | questionnaire | quiz | repeatactivity | resource | sharedsubpage | subpage | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.907700e+04 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | ... | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 | 19077.000000 |
| mean | 6.975854e+05 | 1.815852 | 0.898779 | 2.927504 | 0.266132 | 319.979032 | 3.505321 | 294.471143 | 0.307019 | 4.781098 | ... | 1.762017 | 40.021754 | 2.577607 | 2.504010 | 317.310321 | 0.000262 | 49.663784 | 0.006395 | 147.869319 | 24.287886 |
| std | 5.443597e+05 | 7.233321 | 3.119043 | 10.097041 | 0.980568 | 636.726445 | 31.645296 | 362.405249 | 1.440269 | 12.927816 | ... | 9.147729 | 96.031117 | 6.254059 | 7.306026 | 551.266264 | 0.026104 | 96.579696 | 0.099595 | 187.656337 | 40.288575 |
| min | 6.516000e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 5.034910e+05 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 34.000000 | 0.000000 | 88.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 25.000000 | 3.000000 |
| 50% | 5.867620e+05 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 132.000000 | 0.000000 | 194.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 95.000000 | 0.000000 | 31.000000 | 0.000000 | 76.000000 | 12.000000 |
| 75% | 6.349280e+05 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 352.000000 | 0.000000 | 379.000000 | 0.000000 | 4.000000 | ... | 0.000000 | 34.000000 | 2.000000 | 0.000000 | 485.000000 | 0.000000 | 60.000000 | 0.000000 | 212.000000 | 32.000000 |
| max | 2.698577e+06 | 143.000000 | 69.000000 | 340.000000 | 13.000000 | 13154.000000 | 1364.000000 | 8543.000000 | 33.000000 | 316.000000 | ... | 317.000000 | 2117.000000 | 334.000000 | 65.000000 | 13032.000000 | 3.000000 | 5147.000000 | 4.000000 | 4346.000000 | 2134.000000 |
8 rows × 21 columns
# Print column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19077 entries, 0 to 19076 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID_Student 19077 non-null int64 1 course_year_month 19077 non-null object 2 pass_course 19077 non-null bool 3 dataplus 19077 non-null int64 4 dualpane 19077 non-null int64 5 externalquiz 19077 non-null int64 6 folder 19077 non-null int64 7 forumng 19077 non-null int64 8 glossary 19077 non-null int64 9 homepage 19077 non-null int64 10 htmlactivity 19077 non-null int64 11 oucollaborate 19077 non-null int64 12 oucontent 19077 non-null int64 13 ouelluminate 19077 non-null int64 14 ouwiki 19077 non-null int64 15 page 19077 non-null int64 16 questionnaire 19077 non-null int64 17 quiz 19077 non-null int64 18 repeatactivity 19077 non-null int64 19 resource 19077 non-null int64 20 sharedsubpage 19077 non-null int64 21 subpage 19077 non-null int64 22 url 19077 non-null int64 dtypes: bool(1), int64(21), object(1) memory usage: 3.2+ MB
# Count missing values per column.
df.isna().sum()
ID_Student 0 course_year_month 0 pass_course 0 dataplus 0 dualpane 0 externalquiz 0 folder 0 forumng 0 glossary 0 homepage 0 htmlactivity 0 oucollaborate 0 oucontent 0 ouelluminate 0 ouwiki 0 page 0 questionnaire 0 quiz 0 repeatactivity 0 resource 0 sharedsubpage 0 subpage 0 url 0 dtype: int64
# Drop the ID_Student column before further analysis.
# Use the `columns=` keyword: passing `axis` positionally (`drop('ID_Student', 1)`)
# was deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns='ID_Student')

# Summarise the target column (count / unique / top / freq).
print(df['pass_course'].describe())

# pass_course is a two-valued boolean, so a count plot shows the class balance
# directly. `sns.distplot` is deprecated and is meant for continuous
# distributions (the 100-bin histogram/KDE is not meaningful for True/False).
plt.figure(figsize=(9, 8))
sns.countplot(x='pass_course', data=df, color='g', alpha=0.4);
count 19077 unique 2 top True freq 12358 Name: pass_course, dtype: object
# Pairwise correlation of the numeric and boolean columns.
# The object column `course_year_month` is excluded explicitly: pandas 1.x
# dropped non-numeric columns silently, but pandas 2.x raises on them.
# select_dtypes works on both and keeps pass_course (bool) in the matrix.
df.select_dtypes(include=['number', 'bool']).corr()
| pass_course | dataplus | dualpane | externalquiz | folder | forumng | glossary | homepage | htmlactivity | oucollaborate | ... | ouelluminate | ouwiki | page | questionnaire | quiz | repeatactivity | resource | sharedsubpage | subpage | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| pass_course | 1.000000 | 0.154812 | 0.100451 | 0.087266 | 0.128947 | 0.203948 | 0.048854 | 0.295290 | 0.054852 | 0.156591 | ... | 0.045744 | 0.174393 | 0.091318 | 0.163362 | 0.210180 | -0.005209 | 0.153820 | 0.010985 | 0.245619 | 0.201365 |
| dataplus | 0.154812 | 1.000000 | 0.454758 | -0.072248 | 0.264993 | 0.174257 | -0.015104 | 0.322993 | 0.282659 | 0.047325 | ... | 0.169203 | 0.041279 | 0.552618 | 0.752077 | 0.316170 | -0.002521 | 0.048457 | -0.016120 | 0.419391 | 0.230975 |
| dualpane | 0.100451 | 0.454758 | 1.000000 | -0.081658 | 0.258780 | 0.151819 | -0.022173 | 0.282362 | 0.012916 | 0.005734 | ... | 0.224996 | 0.189079 | 0.502815 | 0.544677 | 0.292210 | 0.004833 | 0.082508 | -0.018504 | 0.342631 | 0.221684 |
| externalquiz | 0.087266 | -0.072248 | -0.081658 | 1.000000 | -0.077683 | 0.149212 | 0.171191 | 0.285223 | -0.061260 | 0.300073 | ... | 0.225364 | 0.209246 | -0.069060 | -0.098530 | -0.069656 | -0.002911 | 0.223869 | -0.018618 | 0.424413 | 0.349830 |
| folder | 0.128947 | 0.264993 | 0.258780 | -0.077683 | 1.000000 | 0.071099 | -0.019212 | 0.187807 | -0.053961 | 0.039887 | ... | -0.045589 | -0.021863 | 0.442765 | 0.328777 | 0.330940 | 0.009563 | 0.054474 | -0.017428 | 0.308711 | 0.085384 |
| forumng | 0.203948 | 0.174257 | 0.151819 | 0.149212 | 0.071099 | 1.000000 | 0.057331 | 0.731877 | 0.068599 | 0.221122 | ... | 0.125443 | 0.230668 | 0.156718 | 0.156156 | 0.221318 | 0.018476 | 0.191994 | 0.102713 | 0.383362 | 0.418783 |
| glossary | 0.048854 | -0.015104 | -0.022173 | 0.171191 | -0.019212 | 0.057331 | 1.000000 | 0.096357 | -0.018317 | 0.103250 | ... | 0.038345 | 0.038040 | -0.022357 | -0.018049 | -0.024958 | -0.000160 | 0.086778 | 0.000189 | 0.152831 | 0.080709 |
| homepage | 0.295290 | 0.322993 | 0.282362 | 0.285223 | 0.187807 | 0.731877 | 0.096357 | 1.000000 | 0.142335 | 0.301454 | ... | 0.230564 | 0.362058 | 0.350026 | 0.336082 | 0.463731 | 0.024496 | 0.332555 | 0.042075 | 0.687295 | 0.633643 |
| htmlactivity | 0.054852 | 0.282659 | 0.012916 | -0.061260 | -0.053961 | 0.068599 | -0.018317 | 0.142335 | 1.000000 | 0.077830 | ... | -0.039765 | -0.010692 | 0.129467 | 0.285169 | 0.215290 | 0.000648 | 0.002951 | -0.013688 | 0.228946 | 0.074682 |
| oucollaborate | 0.156591 | 0.047325 | 0.005734 | 0.300073 | 0.039887 | 0.221122 | 0.103250 | 0.301454 | 0.077830 | 1.000000 | ... | -0.062475 | 0.116357 | 0.032593 | 0.058659 | 0.108873 | 0.000947 | 0.219304 | -0.020573 | 0.335250 | 0.191467 |
| oucontent | 0.264943 | 0.590808 | 0.506918 | -0.082768 | 0.356134 | 0.239202 | -0.018528 | 0.533206 | 0.288287 | 0.126064 | ... | 0.168310 | 0.275083 | 0.616742 | 0.672447 | 0.530083 | 0.011278 | 0.155628 | -0.040648 | 0.569371 | 0.350041 |
| ouelluminate | 0.045744 | 0.169203 | 0.224996 | 0.225364 | -0.045589 | 0.125443 | 0.038345 | 0.230564 | -0.039765 | -0.062475 | ... | 1.000000 | 0.094605 | 0.249521 | 0.199361 | 0.122839 | -0.001934 | 0.083152 | -0.003796 | 0.310643 | 0.267894 |
| ouwiki | 0.174393 | 0.041279 | 0.189079 | 0.209246 | -0.021863 | 0.230668 | 0.038040 | 0.362058 | -0.010692 | 0.116357 | ... | 0.094605 | 1.000000 | 0.056819 | 0.025240 | 0.166998 | -0.001947 | 0.160084 | -0.026762 | 0.248239 | 0.436978 |
| page | 0.091318 | 0.552618 | 0.502815 | -0.069060 | 0.442765 | 0.156718 | -0.022357 | 0.350026 | 0.129467 | 0.032593 | ... | 0.249521 | 0.056819 | 1.000000 | 0.643555 | 0.462047 | 0.033752 | 0.106484 | -0.026466 | 0.526778 | 0.240914 |
| questionnaire | 0.163362 | 0.752077 | 0.544677 | -0.098530 | 0.328777 | 0.156156 | -0.018049 | 0.336082 | 0.285169 | 0.058659 | ... | 0.199361 | 0.025240 | 0.643555 | 1.000000 | 0.383571 | 0.005354 | 0.062417 | -0.022008 | 0.458479 | 0.198547 |
| quiz | 0.210180 | 0.316170 | 0.292210 | -0.069656 | 0.330940 | 0.221318 | -0.024958 | 0.463731 | 0.215290 | 0.108873 | ... | 0.122839 | 0.166998 | 0.462047 | 0.383571 | 1.000000 | 0.039301 | 0.209464 | -0.024411 | 0.518357 | 0.238911 |
| repeatactivity | -0.005209 | -0.002521 | 0.004833 | -0.002911 | 0.009563 | 0.018476 | -0.000160 | 0.024496 | 0.000648 | 0.000947 | ... | -0.001934 | -0.001947 | 0.033752 | 0.005354 | 0.039301 | 1.000000 | 0.002780 | -0.000645 | 0.023315 | 0.011293 |
| resource | 0.153820 | 0.048457 | 0.082508 | 0.223869 | 0.054474 | 0.191994 | 0.086778 | 0.332555 | 0.002951 | 0.219304 | ... | 0.083152 | 0.160084 | 0.106484 | 0.062417 | 0.209464 | 0.002780 | 1.000000 | 0.012213 | 0.372256 | 0.253894 |
| sharedsubpage | 0.010985 | -0.016120 | -0.018504 | -0.018618 | -0.017428 | 0.102713 | 0.000189 | 0.042075 | -0.013688 | -0.020573 | ... | -0.003796 | -0.026762 | -0.026466 | -0.022008 | -0.024411 | -0.000645 | 0.012213 | 1.000000 | -0.010316 | 0.024272 |
| subpage | 0.245619 | 0.419391 | 0.342631 | 0.424413 | 0.308711 | 0.383362 | 0.152831 | 0.687295 | 0.228946 | 0.335250 | ... | 0.310643 | 0.248239 | 0.526778 | 0.458479 | 0.518357 | 0.023315 | 0.372256 | -0.010316 | 1.000000 | 0.544838 |
| url | 0.201365 | 0.230975 | 0.221684 | 0.349830 | 0.085384 | 0.418783 | 0.080709 | 0.633643 | 0.074682 | 0.191467 | ... | 0.267894 | 0.436978 | 0.240914 | 0.198547 | 0.238911 | 0.011293 | 0.253894 | 0.024272 | 0.544838 | 1.000000 |
21 rows × 21 columns
from ydata_profiling import ProfileReport

# Build a profiling report over the whole DataFrame with the given title.
profile = ProfileReport(df, title="Profiling Report")

# Bare expression as the last line of the cell so the notebook displays the
# report object (presumably rendered inline as HTML — confirm in the notebook).
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]